#Run the code below only once to install the INLA package.
# install.packages("INLA",repos=c(getOption("repos"),INLA="https://inla.r-inla-download.org/R/stable"), dep=TRUE)
# Attach the packages used throughout this document. library() (rather than
# require()) errors immediately if a package is missing, which is what we
# want for mandatory dependencies in a script; require() only warns and
# returns FALSE, letting the script limp on to a confusing failure later.
library(tidyverse)
library(terra)
library(tidyterra)
library(spdep)
library(mgcv)
library(vegan)
library(spatialreg)
library(INLA)
library(deldir)
library(dismo)
library(lme4)
library(MASS)
library(nlme)
Today we are going to explore solutions when residual spatial dependence exists in your data after fitting your best models. In these situations, model-based adjustments are going to be required to make accurate inference. We are going to be working with data from the Northern Region Landbird Monitoring Program (for more information, see Hutto & Young 2002 [https://www.jstor.org/stable/3784226]). Sampling occurs at 100 m radius sites along transects in Montana and Idaho. The transects are randomly distributed within USFWS lands, but the sampling sites on the transects are placed regularly every 300 m with approximately 10 sites on every transect. At each site, observers conduct 10-minute point counts where they record all birds seen or heard. We are going to be looking at the occurrence of Varied Thrushes (Ixoreus naevius) across an elevation gradient. For our purposes, we will consider that each site was sampled exactly once and ignore the problem of imperfect detection.
First we are going to read in the elevation raster and sampling locations.
# Read the elevation raster (layer 'elev_km') and the Varied Thrush
# point-count data directly from the course GitHub repository.
elev = rast('https://github.com/ValenteJJ/SpatialEcology/raw/main/Week7/elevation.tif')
vath = read.csv('https://raw.githubusercontent.com/ValenteJJ/SpatialEcology/main/Week7/vath_2004.csv')
# Map the sampling locations (points) on top of the elevation surface.
ggplot()+
geom_raster(data=elev, aes(x=x, y=y, fill=elev_km))+
scale_fill_gradientn(colours = terrain.colors(7))+
geom_point(data=vath, aes(x=EASTING, y=NORTHING))+
coord_equal()+
theme_bw()+
theme(panel.grid=element_blank())
From the elevation raster, we can calculate the slope and aspect of each cell. Slope tells us how steep the terrain is at that particular point in space, and aspect tells us what direction the slope is facing.
# Derive slope (steepness in degrees) from the elevation raster using the
# 8-neighbor rule, then map it.
slope = terrain(elev, v='slope', neighbors=8, unit='degrees')
ggplot(slope, aes(x=x, y=y, fill=slope))+
scale_fill_gradientn(colours = terrain.colors(7))+
geom_raster()+
coord_equal()+
theme_bw()+
theme(panel.grid=element_blank())
# Derive aspect (the compass direction the slope faces, in degrees) and map it.
aspect = terrain(elev, v='aspect', neighbors=8, unit='degrees')
ggplot(aspect, aes(x=x, y=y, fill=aspect))+
scale_fill_gradientn(colours = terrain.colors(7))+
geom_raster()+
coord_equal()+
theme_bw()+
theme(panel.grid=element_blank())
Finally, we can combine all three elevation variables (elevation, slope, and aspect) into a bundled raster or raster “stack.”
# Bundle elevation, slope, and aspect into one multi-layer SpatRaster and
# give the layers the short names used in the model formulas below.
elevVars = c(elev, slope, aspect)
names(elevVars) = c('elev', 'slope', 'aspect')
The extract() function in the terra package overlays the points on the raster stack and extracts the values of the rasters at each point location.
# Extract the three terrain values at each sampling location (EASTING/NORTHING
# coordinate pairs) and append them as new columns of the vath data frame.
coords = cbind(vath$EASTING, vath$NORTHING)
landCovs = extract(elevVars, coords)
vath = cbind(vath, landCovs)
We’re going to fit 3 generalized linear models that assume a binomial response variable distribution. In other words, we are going to fit logistic regression models to evaluate the probability of recording a Varied Thrush as a function of elevation-related covariates.
# Three candidate logistic regressions for Varied Thrush presence (VATH),
# compared by AIC below (lower is better).
# VATH presence is a linear function of elevation
vathElev = glm(VATH ~ elev, family='binomial', data=vath)
# VATH presence is a quadratic function of elevation
vathElev2 = glm(VATH ~ elev + I(elev^2), family='binomial', data=vath)
# Vath presence is a function of elevation, slope, and aspect
vathAll = glm(VATH ~ elev + slope + aspect, family='binomial', data=vath)
round(AIC(vathElev, vathElev2, vathAll), 2)
It’s pretty clear from the AIC model comparison that the quadratic elevation model has the most support. We can look at the output.
summary(vathElev2)
##
## Call:
## glm(formula = VATH ~ elev + I(elev^2), family = "binomial", data = vath)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.984 1.990 -4.012 6.01e-05 ***
## elev 10.698 3.227 3.316 0.000915 ***
## I(elev^2) -4.476 1.281 -3.494 0.000475 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 584.34 on 804 degrees of freedom
## Residual deviance: 560.54 on 802 degrees of freedom
## AIC: 566.54
##
## Number of Fisher Scoring iterations: 6
Because the linear elevation term is positive and the quadratic term is negative, this implies that Varied Thrush occupancy probability peaks at intermediate elevations. We can now plot this relationship to visualize it by creating a new dataset and predicting values based on our fitted model.
# Build a prediction grid spanning the observed elevation range, predict on
# the link (logit) scale so the 95% CI can be constructed there, then
# back-transform everything to the probability scale with plogis().
# Note: the original used partially-matched argument names ('length=' for
# seq()'s 'length.out' and 'se=T' for predict.glm()'s 'se.fit'); they are
# spelled out fully here.
glmPred = data.frame(elev = seq(min(vath$elev), max(vath$elev), length.out = 15))
glmPred = cbind(glmPred, predict(vathElev2, newdata = glmPred, type='link', se.fit = TRUE)) %>%
mutate(pred = plogis(fit),
ucl = plogis(fit + 1.96*se.fit),
lcl = plogis(fit - 1.96*se.fit))
# Fitted occupancy probability (solid line) with its 95% confidence band
# (dashed lines).
ggplot(glmPred, aes(x=elev, y=pred))+
geom_line()+
geom_line(aes(y = lcl), linetype='dashed')+
geom_line(aes(y = ucl), linetype='dashed')+
theme_bw()+
theme(panel.grid=element_blank())
We can also predict Varied Thrush occupancy in space by applying our fitted model to the elevation raster.
# Apply the fitted model across the raster stack. Predictions come back on
# the link (logit) scale, so back-transform with the inverse-logit,
# exp(x)/(1+exp(x)), to get occupancy probability per cell.
predRaster = predict(model = vathElev2, object=elevVars)
predRaster = exp(predRaster)/(1+exp(predRaster))
plot(predRaster)
And congratulations! Just like that you’ve fit your first species distribution model. I’m not really sure if it’s a good one, but it is one.
Because there are a lot of sampling points here (n = 805), and because the total sampling area covers a rather large extent, I’m modifying a correlog function so that we can set the maximum distance in which we are interested. Otherwise this can take a very long time because there are a lot of potential distance bins.
# Incremental (binned) Moran's I correlogram with a permutation-based null
# envelope, restricted to a maximum distance so run time stays manageable.
#
# Arguments:
#   locations - two-column matrix of point coordinates (projected units)
#   z         - numeric vector of values (e.g. residuals) at those points
#   binsize   - width of each distance bin
#   maxdist   - largest distance to consider
#
# Returns a data frame with one row per bin: the bin midpoint (dist), the
# observed Moran's I, and the 2.5%/97.5% quantiles of the permutation null.
icorrelogram <- function(locations, z, binsize, maxdist){
  distbin <- seq(0, maxdist, by = binsize)
  Nbin <- length(distbin) - 1
  # Preallocate the results table (one row per distance bin).
  moran.results <- data.frame("dist" = rep(NA, Nbin), "Morans.i" = NA,
                              "null.lower" = NA, "null.upper" = NA)
  for (i in seq_len(Nbin)){
    d.start <- distbin[i]
    d.end <- distbin[i + 1]
    # Neighbors for this bin = point pairs separated by (d.start, d.end].
    neigh <- dnearneigh(x = locations, d1 = d.start, d2 = d.end, longlat = FALSE)
    wts <- nb2listw(neighbours = neigh, style = 'B', zero.policy = TRUE)
    # Monte-Carlo Moran's I; alternative = "greater" means the p-value is
    # only 'significant' for positive autocorrelation.
    mor.i <- moran.mc(x = z, listw = wts, nsim = 200, alternative = "greater",
                      zero.policy = TRUE)
    moran.results[i, "dist"] <- (d.end + d.start) / 2  # bin midpoint
    moran.results[i, "Morans.i"] <- mor.i$statistic    # observed Moran's I
    # 95% null envelope from the permutation distribution.
    moran.results[i, "null.lower"] <- quantile(mor.i$res, probs = 0.025, na.rm = TRUE)
    moran.results[i, "null.upper"] <- quantile(mor.i$res, probs = 0.975, na.rm = TRUE)
  }
  return(moran.results)
}
We will now use this icorrelogram function to evaluate spatial dependence in the raw Varied Thrush presence/absence data.
# Correlogram of the RAW presence/absence data: 1-km bins out to 15 km.
vathCor = icorrelogram(locations=coords, z=vath$VATH, binsize=1000, maxdist=15000)
head(vathCor)
# Observed Moran's I (solid line/points) against the permutation null
# envelope (dashed); values outside the envelope indicate significant
# spatial autocorrelation at that distance.
ggplot(vathCor, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
So looking at the raw data, there appears to be some substantial correlation in the first couple of distance bins because these values fall outside of our random confidence envelope. But, as we all know, our regression assumptions actually apply to the residuals. So what we really want to know is if there is correlation in the residuals of the model.
This is not the time nor the place to get into residuals for generalized linear models. They are complicated, there are 4 different types, and they can be a bit difficult to understand. For the purposes of our exploration here, we are going to rely on deviance residuals which are intuitively similar to the residuals you may be used to from a linear model.
So we’re going to pull out the deviance residuals from our best fitted model (which included the quadratic elevation effect). Then we’re going to run the same icorrelogram() function on those residuals.
# Deviance residuals from the best-supported model (quadratic elevation).
vathElev2Res = residuals(vathElev2, type='deviance')
corResids = icorrelogram(locations = coords, z = vathElev2Res, binsize=1000, maxdist=15000)
# Correlogram of the model residuals with the null envelope.
ggplot(corResids, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
# Raw-data vs residual Moran's I; points falling near the red 1:1 line mean
# the model removed little of the spatial dependence.
plot(vathCor$Morans.i, corResids$Morans.i)
abline(a=0, b=1, col='red')
So, by fitting the elevation model, we have NOT removed the spatial dependence from the data. We’re going to need another solution.
First, we can try the least-ideal solution. We can subset the points in an attempt to reduce the spatial dependence. Because points fall along transects, we may want to try sub-setting out a single point from each transect to get rid of all of the others that may be aggregated in space.
# Thin the data to one randomly-chosen site per transect: draw a uniform
# random number for every row, then keep the row holding the transect-wise
# minimum (ties are essentially impossible with runif).
subData = vath %>%
mutate(randomVar = runif(nrow(.), min=0, max=1)) %>%
group_by(TRANSECT) %>%
filter(randomVar == min(randomVar)) %>%
ungroup()
# Refit the quadratic-elevation model on the thinned data.
subModel = glm(VATH ~ elev + I(elev^2), family='binomial', data=subData)
summary(subModel)
##
## Call:
## glm(formula = VATH ~ elev + I(elev^2), family = "binomial", data = subData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.299 5.082 -2.027 0.0427 *
## elev 14.197 8.104 1.752 0.0798 .
## I(elev^2) -5.722 3.177 -1.801 0.0717 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 126.33 on 166 degrees of freedom
## Residual deviance: 120.69 on 164 degrees of freedom
## AIC: 126.69
##
## Number of Fisher Scoring iterations: 6
Note that by taking this step and sub-setting the data, we have reduced our sample size from 805 to 167 sample points. As a result, the standard errors around our parameter estimates tripled, and the elevation parameters are no longer statistically significant. But…
# Residual correlogram for the thinned-data model. Note the coarser 2-km
# bins: with only ~167 points, 1-km bins would hold too few pairs.
resids = residuals(subModel, type='deviance')
correlation = icorrelogram(locations = cbind(subData$EASTING, subData$NORTHING), z=resids, binsize=2000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
It appears as though we have fixed our spatial dependence problem. Of course, this has come with a cost. Perhaps another approach might be better in this particular situation.
Now we’re going to try several types of models that are going to help us account for spatial dependence. The first is the trend surface model. As we discussed, there are 2 approaches that fit into the “trend surface model” category. We can use polynomial regression, or we can use generalized additive models.
Here we’re going to start with polynomial regression and include a cubic model for the x and y coordinates.
# Trend-surface model: a cubic polynomial in EASTING and NORTHING added to
# the quadratic elevation terms, so broad spatial trends are absorbed by the
# coordinate polynomials.
polyModel = glm(VATH ~ elev + I(elev^2) + EASTING + NORTHING + I(EASTING^2) + I(EASTING^3) + I(NORTHING^2) + I(NORTHING^3), family='binomial', data=vath)
summary(polyModel)
##
## Call:
## glm(formula = VATH ~ elev + I(elev^2) + EASTING + NORTHING +
## I(EASTING^2) + I(EASTING^3) + I(NORTHING^2) + I(NORTHING^3),
## family = "binomial", data = vath)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.861e+00 6.332e+00 -1.557 0.11943
## elev 8.795e+00 3.138e+00 2.803 0.00507 **
## I(elev^2) -3.195e+00 1.248e+00 -2.559 0.01049 *
## EASTING 2.208e-04 4.769e-05 4.631 3.65e-06 ***
## NORTHING -8.018e-05 5.076e-05 -1.580 0.11420
## I(EASTING^2) -1.263e-09 2.806e-10 -4.502 6.72e-06 ***
## I(EASTING^3) 2.090e-15 5.122e-16 4.081 4.48e-05 ***
## I(NORTHING^2) 2.296e-10 1.366e-10 1.681 0.09275 .
## I(NORTHING^3) -2.049e-16 1.179e-16 -1.738 0.08216 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 584.34 on 804 degrees of freedom
## Residual deviance: 486.89 on 796 degrees of freedom
## AIC: 504.89
##
## Number of Fisher Scoring iterations: 6
# Residual correlogram for the polynomial trend-surface model.
resids = residuals(polyModel, type='deviance')
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
And then we can try a GAM function. The gam() function in the mgcv package automates the choice of the spline function through cross validation. Basically, elevation is being modeled similarly to how it was being modeled above, and then the syntax I’ve used below automatically selects the number of knots to fit the best model for the X and Y coordinates.
# GAM trend surface: s(EASTING, NORTHING) fits a 2-D smooth over the
# coordinates whose complexity is chosen automatically by mgcv.
gamModel = gam(VATH ~ elev + I(elev^2) + s(EASTING, NORTHING), family='binomial', data=vath)
summary(gamModel)
##
## Family: binomial
## Link function: logit
##
## Formula:
## VATH ~ elev + I(elev^2) + s(EASTING, NORTHING)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -26.794 5.857 -4.575 4.77e-06 ***
## elev 9.364 4.811 1.947 0.0516 .
## I(elev^2) -2.752 1.898 -1.449 0.1472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(EASTING,NORTHING) 28.68 28.96 73.63 8.76e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.222 Deviance explained = 30.9%
## UBRE = -0.41984 Scale est. = 1 n = 805
# Residual correlogram for the GAM trend-surface model.
resids = residuals(gamModel, type='deviance')
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
The polynomial regression left much to be desired in terms of removing spatial dependence. However, the GAM model did a relatively decent job. That said, there still seems to be some residual dependence at low distances, and it might be worth exploring other options.
Frankly, this is complicated, but I’m going to show you how to do it anyway. First, we’re going to create a spanning tree which creates the minimum set of links necessary to ensure all points are connected.
# Minimum spanning tree over the inter-point distance matrix: the smallest
# set of links that keeps every point connected (toolong=0 disables pruning
# of long links).
spantreeEm = spantree(dist(coords), toolong=0)
Now we’re going to identify neighborhoods using the maximum distance in the spantree as a threshold distance. This creates a list of spatial neighbors and then we can measure the distance among all of those points considered a spatial neighbor.
# Define neighborhoods: any pair of points closer than the longest
# spanning-tree link counts as neighbors (this guarantees a fully connected
# graph), then measure the distances among neighboring points.
dnn = dnearneigh(coords, 0, max(spantreeEm$dist))
dnnDists = nbdists(dnn, coords)
So now we’re going to transform the distances based on a recommendation from Dormann et al. (2007; https://nsojournals.onlinelibrary.wiley.com/doi/full/10.1111/j.2007.0906-7590.05171.x) and then use that to create a weights matrix.
# Distance-decay transformation of the neighbor distances following
# Dormann et al. (2007), then build the spatial weights object.
# zero.policy=T tolerates points that end up with no neighbors.
dnnTransform = lapply(dnnDists, function(x) (1-((x/4)^2)))
weights = nb2listw(dnn, glist=dnnTransform, style='B', zero.policy=T)
Now we use the ME() function in the spatialreg package to find the most important eigenvectors that reduce spatial dependence. ME() does this using a brute force approach to evaluate what combination of eigenvectors reduces spatial dependence the most.
# Moran eigenvector filtering: ME() searches (brute force) for the
# combination of spatial eigenvectors that best reduces residual spatial
# autocorrelation in the elevation model.
vathMe = ME(VATH ~ elev + I(elev^2), family='binomial', listw=weights, data=vath)
vathMe$selection
## Eigenvector ZI pr(ZI)
## 0 NA NA 0.01
## 1 796 NA 0.01
## 2 804 NA 0.02
## 3 805 NA 0.20
head(fitted(vathMe),2)
## vec796 vec804 vec805
## [1,] -0.003042641 -0.008629250 0.01187249
## [2,] -0.003088077 -0.008737222 0.01196633
We’ve now identified 3 eigenvectors that help us reduce spatial dependence. So the last step is to include those eigenvectors into our fitted model and explore the residual spatial dependence.
# Refit the logistic regression with the selected eigenvectors (columns of
# fitted(vathMe)) included as additional covariates.
emModel = glm(VATH ~ elev + I(elev^2) + fitted(vathMe), family='binomial', data=vath)
summary(emModel)
##
## Call:
## glm(formula = VATH ~ elev + I(elev^2) + fitted(vathMe), family = "binomial",
## data = vath)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.401 1.948 -4.312 1.62e-05 ***
## elev 8.776 3.029 2.898 0.00376 **
## I(elev^2) -3.112 1.168 -2.664 0.00773 **
## fitted(vathMe)vec796 14.742 3.198 4.610 4.03e-06 ***
## fitted(vathMe)vec804 -8.644 3.242 -2.666 0.00767 **
## fitted(vathMe)vec805 38.110 8.789 4.336 1.45e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 584.34 on 804 degrees of freedom
## Residual deviance: 499.73 on 799 degrees of freedom
## AIC: 511.73
##
## Number of Fisher Scoring iterations: 7
# Residual correlogram for the eigenvector-filtered model.
resids = residuals(emModel, type='deviance')
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
After all that, this one didn’t do a great job for us.
We’re going to calculate our autocovariate to include as an explanatory variable in our models. Recall that we are calculating the autocovariate as a weighted mean of the response variables around it. Using this as an explanatory variable acknowledges that the values nearby affect the values locally. Because most of the autocorrelation we’ve observed occurs within 1 km, we are going to use that as the neighborhood in which we consider the values at other points.
# Autocovariate: for each point, a weighted mean of VATH at neighboring
# points within nbs = 1000 m (points beyond 1 km are ignored, hence the
# "no neighbours" warning for isolated points).
auto1km = autocov_dist(vath$VATH, coords, nbs=1000, type='one', style='B', zero.policy=T)
## Warning in autocov_dist(vath$VATH, coords, nbs = 1000, type = "one", style =
## "B", : With value 1000 some points have no neighbours
We can now include this autocovariate as an explanatory variable in our logistic regression equation.
# Autologistic model: the quadratic elevation terms plus the 1-km
# autocovariate as an additional predictor.
auto1kmModel = glm(VATH ~ elev + I(elev^2) + auto1km, family='binomial', data=vath)
summary(auto1kmModel)
##
## Call:
## glm(formula = VATH ~ elev + I(elev^2) + auto1km, family = "binomial",
## data = vath)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.6518 1.9660 -3.383 0.000716 ***
## elev 6.9046 3.1222 2.211 0.027006 *
## I(elev^2) -2.8061 1.2106 -2.318 0.020450 *
## auto1km 0.8665 0.1008 8.596 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 584.34 on 804 degrees of freedom
## Residual deviance: 470.99 on 801 degrees of freedom
## AIC: 478.99
##
## Number of Fisher Scoring iterations: 6
# Residual correlogram for the autologistic model.
resids = residuals(auto1kmModel, type='deviance')
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
This approach has removed the preponderance of the spatial dependence. It has also reduced the effect sizes and the p-values associated with the elevation variable.
Fitting autoregressive models with non-normal error terms is computationally challenging. Because of that, many folks use Bayesian models, simply because it is easier to program these complicated models in Bayesian software, and not because inference lends itself to Bayesian philosophy. The spBayes package can be used for spatial regression, but again, this requires a lot of computer power and run time. A shortcut is to use “integrated nested Laplace approximation” or INLA. However, this is only available for certain types of Bayesian models. Fortunately, we can use it to fit CAR models for binary data.
Step 1 is to create Thiessen or Voronoi polygons from the point data. These polygons partition a region into convex polygons such that each one contains exactly one point.
# Build Thiessen/Voronoi polygons, one per sampling point, and plot them
# with the points overlaid (converted to sf just for plotting with geom_sf).
thiessen = voronoi(coords)
tmp = st_as_sf(thiessen)
ggplot()+
geom_sf(data = tmp)+
geom_point(data = vath, aes(x=EASTING, y=NORTHING), color='red')
Then we create a matrix of connected points/polygons and format it into the kind of matrix we can use to fit a CAR model.
# Polygon contiguity -> neighbor list -> binary adjacency matrix, then
# coerce to the sparse (TsparseMatrix) form that inla() expects for the
# 'besag' graph. The chained as() calls are the Matrix-package-sanctioned
# route to a triplet sparse matrix.
pointPoly = poly2nb(thiessen)
plot(thiessen)
points(coords, col='red')
plot(pointPoly, coords, col='red', add=T)
adj = nb2mat(pointPoly, style='B')
adj = as(as(as(adj, "dMatrix"), "generalMatrix"), "TsparseMatrix")
And we fit the CAR model using the inla() function. We need to specify the type of model we are fitting and the covariates. For a CAR model, we add an observation-level covariate (SURVEYID) and specify ‘besag’.
# CAR (intrinsic Besag) model in INLA: SURVEYID indexes an observation-level
# spatial random effect whose dependence structure follows the adjacency
# graph; compute=T asks INLA to return fitted values for the residual
# calculations below.
inlaModel = inla(VATH ~ elev + I(elev^2) + f(SURVEYID, model='besag', graph=adj), family='binomial', data=vath, control.predictor=list(compute=T))
summary(inlaModel)
##
## Call:
## c("inla.core(formula = formula, family = family, contrasts = contrasts,
## ", " data = data, quantiles = quantiles, E = E, offset = offset, ", "
## scale = scale, weights = weights, Ntrials = Ntrials, strata = strata,
## ", " lp.scale = lp.scale, link.covariates = link.covariates, verbose =
## verbose, ", " lincomb = lincomb, selection = selection, control.compute
## = control.compute, ", " control.predictor = control.predictor,
## control.family = control.family, ", " control.inla = control.inla,
## control.fixed = control.fixed, ", " control.mode = control.mode,
## control.expert = control.expert, ", " control.hazard = control.hazard,
## control.lincomb = control.lincomb, ", " control.update =
## control.update, control.lp.scale = control.lp.scale, ", "
## control.pardiso = control.pardiso, only.hyperparam = only.hyperparam,
## ", " inla.call = inla.call, inla.arg = inla.arg, num.threads =
## num.threads, ", " keep = keep, working.directory = working.directory,
## silent = silent, ", " inla.mode = inla.mode, safe = FALSE, debug =
## debug, .parent.frame = .parent.frame)" )
## Time used:
## Pre = 1.35, Running = 0.922, Post = 0.0904, Total = 2.36
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) -8.310 1.968 -12.170 -8.310 -4.450 -8.310 0
## elev 11.239 3.191 4.982 11.239 17.496 11.239 0
## I(elev^2) -4.700 1.267 -7.184 -4.700 -2.217 -4.700 0
##
## Random effects:
## Name Model
## SURVEYID Besags ICAR model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant 0.975quant mode
## Precision for SURVEYID 19863.02 19927.34 572.47 13812.13 73877.26 199.59
##
## Marginal log-Likelihood: -899.24
## is computed
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
To look at the residuals from a fitted CAR model, we unfortunately have to calculate the deviance residuals by hand because they are not exported automatically. That said, this isn’t that hard.
# Hand-computed Bernoulli deviance residuals:
#   sign(obs - fit) * sqrt(-2 * [y*log(p) + (1-y)*log(1-p)])
inlaModelFit = inlaModel$summary.fitted.values$mean
# si is the sign term: +1 where a thrush was detected, -1 where it was not
# (equivalent to sign(y - p) since p is strictly between 0 and 1).
si = ifelse(vath$VATH==1, 1, -1)
resids = si*(-2*(vath$VATH*log(inlaModelFit) + (1-vath$VATH) * log(1-inlaModelFit)))^0.5
# Residual correlogram for the CAR model.
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
And again, after some complex work, we’re seeing we haven’t really eliminated the spatial dependence in our residuals.
There is a natural aggregation or level here in these data. Recall that transects were randomly placed, and then from there the points along the transect were systematic. So now we are going to try and account for the spatial dependence by using a random transect variable. Because this is not a spatially explicit model, we are assuming that the spatial dependence is consistent within a transect, regardless of how far apart two points are located.
# Random-intercept logistic regression: TRANSECT (coerced to a factor) is a
# grouping factor that absorbs within-transect dependence without being
# spatially explicit.
vath = vath %>% mutate(TRANSECT = as.factor(as.character(TRANSECT)))
glmerModel = lme4::glmer(VATH ~ elev + I(elev^2) + (1|TRANSECT), family='binomial', data=vath)
summary(glmerModel)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: VATH ~ elev + I(elev^2) + (1 | TRANSECT)
## Data: vath
##
## AIC BIC logLik deviance df.resid
## 498.4 517.2 -245.2 490.4 801
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3520 -0.1755 -0.1541 -0.1129 5.7688
##
## Random effects:
## Groups Name Variance Std.Dev.
## TRANSECT (Intercept) 4.456 2.111
## Number of obs: 805, groups: TRANSECT, 167
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.470 3.258 -2.600 0.00934 **
## elev 9.459 5.149 1.837 0.06621 .
## I(elev^2) -4.043 1.977 -2.045 0.04083 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) elev
## elev -0.981
## I(elev^2) 0.946 -0.988
# Residual correlogram for the random-transect (GLMM) model.
resids = residuals(glmerModel, type='deviance')
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
That random transect effect pretty much took care of the positive spatial autocorrelation, although we now have some negative autocorrelation in the residuals at short distances.
When you are fitting a model that assumes normally distributed residuals, you can use the nlme package to model spatial correlation structures in the residuals. Unfortunately, it’s a bit more challenging for generalized linear models. We have to rely on penalized quasi-likelihood to estimate parameters. This approach has some poor properties (Rousset and Ferdy 2014 - https://nsojournals.onlinelibrary.wiley.com/doi/full/10.1111/ecog.00566), and we cannot compare models using likelihood-based approaches if they are fit this way. So why do it? I guess the answer is that it’s the best option we’ve got, and it gives us a mechanism for estimating environmental relationships while accounting for spatial dependence.
Here we’re fitting an exponential correlation function within transects and ignoring any dependence among sites that are not on the same transect.
\(\color{red}{\text{CAVEAT: We're into some weird and theoretically tenuous stuff here. I THINK I've pulled the residuals out and tested them correctly below, but you may want to dive a little deeper if you're actually going to use this tool in a publication.}}\)
# Penalized quasi-likelihood GLMM with an exponential spatial correlation
# structure on the within-transect residuals; points on different transects
# are treated as independent.
spatMixModel = glmmPQL(VATH ~ elev + I(elev^2), random = ~1|TRANSECT, correlation = corExp(form = ~ EASTING + NORTHING), family='binomial', data = vath)
## iteration 1
## iteration 2
## iteration 3
## iteration 4
## iteration 5
## iteration 6
## iteration 7
## iteration 8
summary(spatMixModel)
## Linear mixed-effects model fit by maximum likelihood
## Data: vath
## AIC BIC logLik
## NA NA NA
##
## Random effects:
## Formula: ~1 | TRANSECT
## (Intercept) Residual
## StdDev: 1.814525 0.647076
##
## Correlation Structure: Exponential spatial correlation
## Formula: ~EASTING + NORTHING | TRANSECT
## Parameter estimate(s):
## range
## 105.4574
## Variance function:
## Structure: fixed weights
## Formula: ~invwt
## Fixed effects: VATH ~ elev + I(elev^2)
## Value Std.Error DF t-value p-value
## (Intercept) -6.244332 2.502019 636 -2.495718 0.0128
## elev 6.666340 3.924825 636 1.698506 0.0899
## I(elev^2) -2.931414 1.502899 636 -1.950507 0.0516
## Correlation:
## (Intr) elev
## elev -0.984
## I(elev^2) 0.946 -0.987
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.4749550 -0.2837082 -0.2617274 -0.2054926 6.8496399
##
## Number of Observations: 805
## Number of Groups: 167
# NOTE(review): resid() on a glmmPQL fit returns within-group residuals from
# the working linear mixed model, not deviance residuals - confirm this is
# the intended residual type before relying on this correlogram.
resids = resid(spatMixModel)
correlation = icorrelogram(locations = cbind(vath$EASTING, vath$NORTHING), z=resids, binsize=1000, maxdist=15000)
ggplot(correlation, aes(x=dist, y=Morans.i))+
geom_line()+
geom_point()+
ylim(-0.5, 0.5)+
theme_bw()+
geom_hline(yintercept=0, color='red', linetype='dashed')+
geom_line(aes(y=null.lower), linetype='dashed')+
geom_line(aes(y=null.upper), linetype='dashed')
And we can also fit an exponential correlation function across the whole study area and ignore transects. I’ve provided code below, but I’m not going to run it here because it takes quite a long time.
# vath = vath %>%
# mutate(group = 'obs')
#
# glsModel = glmmPQL(VATH ~ elev + I(elev^2), random = ~1|group, correlation=corExp(form = ~EASTING + NORTHING), family='binomial', data=vath)
\(\color{red}{\text{CAVEAT: This was haphazardly created minutes before class... I need to revisit, but it gets the point across.}}\)
# Compare the elevation effect sizes (with 95% CIs) across all fitted models.
# Rows 2:3 of each coefficient table hold the linear and quadratic elevation
# terms. The param labels are generated with length.out = n() so the labeling
# always matches the number of bound rows (7 models x 2 terms = 14 here; the
# original rep(..., 6) produced only 12 labels and would fail).
# NOTE(review): summary.gam() stores parametric coefficients in $p.table;
# confirm summary(gamModel)$coefficients is non-NULL in your mgcv version.
tmp = rbind(data.frame(summary(vathElev2)$coefficients[2:3,]) %>% mutate('model'='vathElev2'),
data.frame(summary(subModel)$coefficients[2:3,]) %>% mutate('model'='subModel'),
data.frame(summary(polyModel)$coefficients[2:3,]) %>% mutate('model'='polyModel'),
data.frame(summary(gamModel)$coefficients[2:3,]) %>% mutate('model'='gamModel'),
data.frame(summary(emModel)$coefficients[2:3,]) %>% mutate('model'='emModel'),
data.frame(summary(auto1kmModel)$coefficients[2:3,]) %>% mutate('model'='auto1kmModel'),
data.frame(summary(glmerModel)$coefficients[2:3,]) %>% mutate('model'='glmerModel')) %>%
mutate(param = rep(c('elev', 'elev2'), length.out = n())) %>%
mutate(lcl = Estimate - 1.96*Std..Error,
ucl = Estimate + 1.96*Std..Error)
# Dot-and-whisker plot, one panel per parameter; the red dashed line marks
# an effect of zero.
ggplot(tmp, aes(x=model, y=Estimate))+
facet_wrap(~param)+
coord_flip()+
geom_point()+
geom_errorbar(aes(ymin = lcl, ymax=ucl))+
theme_bw()+
theme(panel.grid=element_blank())+
geom_hline(yintercept=0, color='red', linetype='dashed')
# summary(spatMixModel)
# summary(inlaModel)$fixed[2:3,]